Берем данные

library(tidyverse)
sign <- read_csv2("refactored_table.csv")
sign %>% 
  filter(non_iconic == "0") %>% 
  select(4, 6, 1, 2, 8:11) %>% 
  slice(-c(32, 1164)) %>% # убиваю два неиконичных набоюления, они будут ломать MCA
  mutate_all(funs(factor)) -> 
  sign
sign
summary(sign)
##  form-image_assocaition_pattern                    languages   
##  object  :876                   Turkish Sign Language   : 129  
##  tracing :512                   Latvian Sign Language   : 117  
##  handling:324                   Estonian Sign Language  : 112  
##  contour :168                   Italian Sign Language   : 112  
##  ?       :  3                   Lithuanian Sign Language: 111  
##  hangling:  1                   Polish Sign Language    : 111  
##  (Other) :  3                   (Other)                 :1195  
##           word          semantic_field localization personification
##  underground:  39   animals    :298    0:1253       0:1503         
##  knife      :  38   clothes    :188    1: 634       1: 384         
##  fork       :  36   food       :220                                
##  banana     :  35   house      :216                                
##  bread      :  34   instruments:255                                
##  helicopter :  34   nature     :403                                
##  (Other)    :1671   transport  :307                                
##  action  parts-wholes
##  0:958   0:1482      
##  1:929   1: 405      
##                      
##                      
##                      
##                      
## 

MCA

Проведем MCA. Какой процент объясненной дисперсии содержат новые компоненты?

sign_mca <- MASS::mca(sign[,-c(1:3)], nf = 8)
data_frame(pca = factor(paste0("PC", seq_along(sign_mca$d)), 
                        levels = paste0("PC", seq_along(sign_mca$d))),
           eigenvalue = sign_mca$d^2,
           perc_var = round(sign_mca$d^2/sum(sign_mca$d^2), 3),
           cum_perc_var = cumsum(perc_var)) %>% 
  arrange(desc(perc_var)) ->
  sign_mca_results

sign_mca_results %>% 
  gather(method, value, perc_var:cum_perc_var) %>%
  mutate(method = factor(method, levels = c("perc_var", "cum_perc_var"))) %>% 
  ggplot(aes(pca, value, label = round(value, 2)))+
  geom_col(fill = "lightblue")+
  geom_text(aes(y = value + 0.05))+
  facet_grid(~method, scales = "free_x")+
  theme_bw()+
  labs(x = "", y = "",
       title = "Impact of each PC in MCA model")

Первые две переменные описывают 48% дисперсии. Для MCA это высокий процент. Посмотрим, как распеделены исследуемые единицы в новом пространстве. Построим отдельный график для переменных:

# data frame for ggplot
cats = apply(sign[,-c(1:3)], 2, function(x) nlevels(as.factor(x)))
data.frame(sign_mca$cs,  Variable = rep(names(cats), cats)) %>% 
  ggplot(aes(x = X1, y = X2, label = rownames(.))) +
   geom_hline(yintercept = 0, colour = "gray70") +
   geom_vline(xintercept = 0, colour = "gray70") +
   geom_text(aes(colour = Variable)) +
   ggtitle("MCA plot of variables using R package MASS")+
  theme_bw()+
  scale_x_continuous(limits = c(-0.015, 0.015))

Нам потом придется придумать, почему те или иные переменные группируются рядом.

Следующий график с наблюдениями, раскрасим по паттерну tracing-handling:

# data frame for ggplot
data.frame(sign_mca$rs,  word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`) %>% 
  ggplot(aes(x = X1, y = X2, label = word, color = type)) +
   geom_hline(yintercept = 0, colour = "gray70") +
   geom_vline(xintercept = 0, colour = "gray70") +
   geom_point() +
  stat_ellipse()+
   ggtitle("MCA plot of observations using R package MASS")+
  theme_bw()

Следующий график с наблюдениями, раскрасим по языку:

# data frame for ggplot
data.frame(sign_mca$rs,  word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`) %>% 
  ggplot(aes(x = X1, y = X2, label = word, color = language)) +
   geom_hline(yintercept = 0, colour = "gray70") +
   geom_vline(xintercept = 0, colour = "gray70") +
   geom_point() +
  stat_ellipse()+
   ggtitle("MCA plot of observations using R package MASS")+
  theme_bw()

Следующий график с наблюдениями, посмотрим на слова:

# data frame for ggplot
data.frame(sign_mca$rs,  word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`) %>% 
  ggplot(aes(x = X1, y = X2, label = word, color = type)) +
   geom_hline(yintercept = 0, colour = "gray70") +
   geom_vline(xintercept = 0, colour = "gray70") +
   geom_text() +
   ggtitle("MCA plot of observations using R package MASS")+
  theme_bw() ->
  plot

library(plotly)
ggplotly(plot)

Они конечно, налезают друг на друга…

Раскрасим по семантике:

data.frame(sign_mca$rs,  word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`, semantic = sign$semantic_field) %>% 
  ggplot(aes(x = X1, y = X2, label = word, color = semantic)) +
   geom_hline(yintercept = 0, colour = "gray70") +
   geom_vline(xintercept = 0, colour = "gray70") +
   geom_point() +
  stat_ellipse()+
   ggtitle("MCA plot of observations using R package MASS")+
  theme_bw()
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure